Contents

library(Matrix)
library(scran)
library(Rtsne)
#set it up for scran to be properly parallelised
library(BiocParallel)
ncores = 16
mcparam = SnowParam(workers = ncores)
register(mcparam)
library(irlba)
library(cowplot)

source("/nfs/research1/marioni/jonny/embryos/scripts/core_functions.R")
load_data(remove_doublets = TRUE, remove_stripped = TRUE)

nPC = 50

# #subset for testing
# sce = normalize(sce[,meta$stage %in% c("E7.25", "E7.5")])
# meta = meta[meta$stage %in% c("E7.25", "E7.5"),]

In this script, we perform batch correction on our data.

1 Batch correction

For batch correction, we employ the scran function fastMNN, which performs batch correction in the manner of mnnCorrect, but in the PC-space, and much faster. Specifically, we will perform 3 types of correction:

  1. Correction of entire dataset, using PCs calculated over all cells using HVGs from all cells

  2. Correction per timepoint, using PCs calculated for cells of each timepoint, using HVGs calculated at each timepoint

  3. Correction per Theiler stage, using PCs calculated for cells of each Theiler stage, using HVGs calculated at each Theiler stage

Three types of correction are important: considering each timepoint/Theiler stage separately will provide a greater resolution of the structure of the data rather than subsetting coordinates from a single large PCA.

1.1 Total correction

hvgs = getHVGs(sce)
## Loading required package: biomaRt
#get order: oldest to youngest; most cells to least cells
order_df = meta[!duplicated(meta$sample), c("stage", "sample")]
order_df$ncells = sapply(order_df$sample, function(x) sum(meta$sample == x))
order_df$stage = factor(order_df$stage, 
                        levels = rev(c("E8.5", 
                                   "E8.25", 
                                   "E8.0", 
                                   "E7.75", 
                                   "E7.5", 
                                   "E7.25", 
                                   "mixed_gastrulation", 
                                   "E7.0", 
                                   "E6.75", 
                                   "E6.5")))
order_df = order_df[order(order_df$stage, order_df$ncells, decreasing = TRUE),]
order_df$stage = as.character(order_df$stage)

all_correct = doBatchCorrect(counts = logcounts(sce)[rownames(sce) %in% hvgs,], 
                             timepoints = meta$stage, 
                             samples = meta$sample, 
                             timepoint_order = order_df$stage, 
                             sample_order = order_df$sample, 
                             npc = 50,
                             BPPARAM = mcparam)


save(all_correct, file = "/nfs/research1/marioni/jonny/embryos/scripts/batch_correct/all_correction.RData")

A t-SNE visualisation of all cells, pre- and post-correction, is shown in Figure 1.

base_pca = prcomp_irlba(t(logcounts(sce)[rownames(sce) %in% hvgs,]), n = nPC)$x
base_tsne = Rtsne(base_pca, pca = FALSE)$Y
corrected_tsne = Rtsne(all_correct, pca = FALSE)$Y

base_tsne = as.data.frame(base_tsne)
base_tsne$sample = meta$sample
base_tsne$state = "Uncorrected"

corrected_tsne = as.data.frame(corrected_tsne)
corrected_tsne$sample = meta$sample
corrected_tsne$state = "Corrected"

bc_tsne = rbind(base_tsne, corrected_tsne)

reorder = sample(nrow(bc_tsne), nrow(bc_tsne))

ggplot(bc_tsne[reorder,], aes(x = V1, y = V2, col = factor(sample))) +
  geom_point(size = 0.4) +
  scale_colour_Publication() +
  theme(legend.position = "none",
        axis.line = element_blank(),
        axis.text = element_blank(),
        axis.ticks = element_blank(),
        axis.title = element_blank()) +
  facet_wrap(~state, nrow = 2)
Correction over all data.

Figure 1: Correction over all data

1.2 Timepoint correction

stage_corrections = lapply(unique(meta$stage), function(x){
  sub_sce = normalize(sce[,meta$stage == x])
  sub_meta = meta[meta$stage == x,]
  hvgs = getHVGs(sub_sce)
  
  correct = doBatchCorrect(counts = logcounts(sub_sce)[rownames(sub_sce) %in% hvgs,], 
                           timepoints = sub_meta$stage, 
                           samples = sub_meta$sample, 
                           timepoint_order = order_df$stage, 
                           sample_order = order_df$sample, 
                           npc = 50,
                           BPPARAM = mcparam)
  return(correct)
  
})
names(stage_corrections) = unique(meta$stage)

t-SNE visualisations of cells at each timepoint, pre- and post-correction, are shown in Figure 2. Corrections were computed in the scope of each timepoint exclusively (i.e. for calculation of HVGs, PCA).

tsnes = lapply(stage_corrections, Rtsne, pca = FALSE)
tsnes = lapply(tsnes, function(x) as.data.frame(x$Y))
for(i in 1:length(tsnes)){
  tsnes[[i]]$stage = unique(meta$stage)[i]
  tsnes[[i]]$sample = meta$sample[meta$stage == unique(meta$stage)[i]]
  tsnes[[i]]$state = "Corrected"
}

base_pcas = lapply(unique(meta$stage), function(x){
  sub_sce = normalize(sce[,meta$stage == x])
  sub_meta = meta[meta$stage == x,]
  hvgs = getHVGs(sub_sce)
  
  pca = prcomp_irlba(t(logcounts(sub_sce)[rownames(sub_sce) %in% hvgs,]), n = nPC)
  return(pca$x)
})
base_tsnes = lapply(base_pcas, Rtsne, pca = FALSE)
base_tsnes = lapply(base_tsnes, function(x) as.data.frame(x$Y))
for(i in 1:length(base_tsnes)){
  base_tsnes[[i]]$stage = unique(meta$stage)[i]
  base_tsnes[[i]]$sample = meta$sample[meta$stage == unique(meta$stage)[i]]
  base_tsnes[[i]]$state = "Uncorrected"
}

big_df = rbind(
  do.call(rbind, base_tsnes),
  do.call(rbind, tsnes)
)

reorder = sample(nrow(big_df), nrow(big_df))

ggplot(big_df[reorder,], aes(x = V1, y = V2, col = factor(sample))) +
  geom_point(size = 0.4) +
  scale_colour_Publication() +
  theme(legend.position = "none",
        axis.line = element_blank(),
        axis.text = element_blank(),
        axis.ticks = element_blank(),
        axis.title = element_blank()) +
  facet_grid(stage~state)
Correction computed for each timepoint separately.

Figure 2: Correction computed for each timepoint separately

1.3 Theiler stage correction

theiler_corrections = lapply(unique(meta$theiler), function(x){
  sub_sce = normalize(sce[,meta$theiler == x])
  sub_meta = meta[meta$theiler == x,]
  hvgs = getHVGs(sub_sce)
  
  correct = doBatchCorrect(counts = logcounts(sub_sce)[rownames(sub_sce) %in% hvgs,], 
                           timepoints = sub_meta$stage, 
                           samples = sub_meta$sample, 
                           timepoint_order = order_df$stage, 
                           sample_order = order_df$sample, 
                           npc = 50,
                           BPPARAM = mcparam)
  return(correct)
  
})

names(theiler_corrections) = unique(meta$theiler)

t-SNE visualisations of cells at each Theiler stage, pre- and post-correction, are shown in Figure 3. Corrections were computed in the scope of each Theiler stage exclusively (i.e. for calculation of HVGs, PCA).

tsnes = lapply(theiler_corrections, Rtsne, pca = FALSE)
tsnes = lapply(tsnes, function(x) as.data.frame(x$Y))
for(i in 1:length(tsnes)){
  tsnes[[i]]$theiler = unique(meta$theiler)[i]
  tsnes[[i]]$sample = meta$sample[meta$theiler == unique(meta$theiler)[i]]
  tsnes[[i]]$state = "Corrected"
}

base_pcas = lapply(unique(meta$theiler), function(x){
  sub_sce = normalize(sce[,meta$theiler == x])
  sub_meta = meta[meta$theiler == x,]
  hvgs = getHVGs(sub_sce)
  
  pca = prcomp_irlba(t(logcounts(sub_sce)[rownames(sub_sce) %in% hvgs,]), n = nPC)
  return(pca$x)
})
base_tsnes = lapply(base_pcas, Rtsne, pca = FALSE)
base_tsnes = lapply(base_tsnes, function(x) as.data.frame(x$Y))

for(i in 1:length(base_tsnes)){
  base_tsnes[[i]]$theiler = unique(meta$theiler)[i]
  base_tsnes[[i]]$sample = meta$sample[meta$theiler == unique(meta$theiler)[i]]
  base_tsnes[[i]]$state = "Uncorrected"
}

big_df = rbind(
  do.call(rbind, base_tsnes),
  do.call(rbind, tsnes)
)

reorder = sample(nrow(big_df), nrow(big_df))

ggplot(big_df[reorder,], aes(x = V1, y = V2, col = factor(sample))) +
  geom_point(size = 0.4) +
  scale_colour_Publication() +
  theme(legend.position = "none",
        axis.line = element_blank(),
        axis.text = element_blank(),
        axis.ticks = element_blank(),
        axis.title = element_blank()) +
  facet_grid(theiler~state)
Correction computed for each Theiler stage.

Figure 3: Correction computed for each Theiler stage

corrections = list("all" = all_correct, "theiler" = theiler_corrections, "stage" = stage_corrections)
saveRDS(corrections, file = "/nfs/research1/marioni/jonny/embryos/data/corrected_pcas.rds")

2 Session Info

sessionInfo()
## R version 3.5.0 (2018-04-23)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.1 LTS
## 
## Matrix products: default
## BLAS: /usr/local/lib/R/lib/libRblas.so
## LAPACK: /usr/local/lib/R/lib/libRlapack.so
## 
## locale:
## [1] C
## 
## attached base packages:
## [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] scales_0.5.0                biomaRt_2.37.3             
##  [3] scater_1.9.10               cowplot_0.9.3              
##  [5] ggplot2_3.0.0               irlba_2.3.2                
##  [7] Rtsne_0.13                  scran_1.9.13               
##  [9] SingleCellExperiment_1.3.9  SummarizedExperiment_1.11.6
## [11] DelayedArray_0.7.21         matrixStats_0.54.0         
## [13] Biobase_2.41.2              GenomicRanges_1.33.11      
## [15] GenomeInfoDb_1.17.1         IRanges_2.15.16            
## [17] S4Vectors_0.19.19           BiocGenerics_0.27.1        
## [19] BiocParallel_1.15.8         Matrix_1.2-14              
## [21] BiocStyle_2.9.3             rmarkdown_1.10             
## 
## loaded via a namespace (and not attached):
##  [1] bitops_1.0-6             bit64_0.9-7             
##  [3] httr_1.3.1               progress_1.2.0          
##  [5] rprojroot_1.3-2          dynamicTreeCut_1.63-1   
##  [7] tools_3.5.0              backports_1.1.2         
##  [9] R6_2.2.2                 vipor_0.4.5             
## [11] DBI_1.0.0                lazyeval_0.2.1          
## [13] colorspace_1.3-2         withr_2.1.2             
## [15] prettyunits_1.0.2        tidyselect_0.2.4        
## [17] gridExtra_2.3            bit_1.1-14              
## [19] compiler_3.5.0           labeling_0.3            
## [21] bookdown_0.7             stringr_1.3.1           
## [23] digest_0.6.15            XVector_0.21.3          
## [25] pkgconfig_2.0.1          htmltools_0.3.6         
## [27] highr_0.7                limma_3.37.3            
## [29] rlang_0.2.1              RSQLite_2.1.1           
## [31] DelayedMatrixStats_1.3.4 bindr_0.1.1             
## [33] dplyr_0.7.6              RCurl_1.95-4.11         
## [35] magrittr_1.5             GenomeInfoDbData_1.1.0  
## [37] Rcpp_0.12.18             ggbeeswarm_0.6.0        
## [39] munsell_0.5.0            Rhdf5lib_1.3.1          
## [41] viridis_0.5.1            stringi_1.2.4           
## [43] yaml_2.2.0               edgeR_3.23.3            
## [45] zlibbioc_1.27.0          rhdf5_2.25.4            
## [47] plyr_1.8.4               grid_3.5.0              
## [49] blob_1.1.1               crayon_1.3.4            
## [51] lattice_0.20-35          hms_0.4.2               
## [53] locfit_1.5-9.1           knitr_1.20              
## [55] pillar_1.3.0             igraph_1.2.1            
## [57] rjson_0.2.20             kmknn_0.99.16           
## [59] reshape2_1.4.3           XML_3.98-1.12           
## [61] glue_1.3.0               evaluate_0.11           
## [63] data.table_1.11.4        gtable_0.2.0            
## [65] purrr_0.2.5              assertthat_0.2.0        
## [67] xfun_0.3                 viridisLite_0.3.0       
## [69] snow_0.4-2               tibble_1.4.2            
## [71] AnnotationDbi_1.43.1     beeswarm_0.2.3          
## [73] memoise_1.1.0            tximport_1.9.8          
## [75] bindrcpp_0.2.2           statmod_1.4.30